This notebook is a follow up notebook to previous benchmarking done in 02-compare-quants.RMD. Here, we have used the most recent release of Salmon and Cellranger 6.0.0, and included the new Alevin-Fry method. Alevin-Fry can use either a selective alignment strategy or pseudoalignment strategy (with the --sketch flag),so here we are testing both strategies. Alevin-Fry also can use the 10X whitelist for barcode assignment rather than generating the putative whitelist using the --unfiltered-pl flag, so we have also included that in our tests.
The resolution currently used for Alevin-Fry is the Full resolution. See the documentation for more information on different options for resolution that could be used.
The goal of this notebook is to compare the quantification performed by each tool across the samples we have currently run. Thus far, we have 4 10Xv3 single cell RNA-seq samples that have been run on all tools, SCPCR000003, SCPCR000006, SCPCR000126, and SCPCR000127. We also have 2 10Xv3.1 single nucleus RNA-seq samples that have been run on all tools, SCPCR000118 and SCPCR000119. All the snRNA-seq samples have been run using either the pre-mRNA index or the mRNA index. One thing to note is that the new spliced index is different from the previously used index’s with the single cell RNA seq samples in that it was created by grabbing the genomic regions corresponding to spliced cDNA only from the ensembl gtf, rather than taking the cDNA.fasta directly from ensembl.
library(tidyverse)
library(ggplot2)
library(SingleCellExperiment)
library(ggforce)
# where all the data lives after running 01-import-quant-data.Rmd
import_dir <- file.path('data')
quant_info_file <- file.path(import_dir, 'quant_info.tsv')
alevin_rds <- file.path(import_dir, 'alevin_sces.rds')
alevin_fry_rds <- file.path(import_dir, 'alevin_fry_sces.rds')
alevin_fry_sketch_rds <- file.path(import_dir, 'alevin_fry_sketch_sces.rds')
alevin_fry_unfiltered_rds <- file.path(import_dir, 'alevin_fry_unfiltered_sces.rds')
alevin_fry_unfiltered_sketch_rds <- file.path(import_dir, 'alevin_fry_unfiltered_sketch_sces.rds')
kallisto_rds <- file.path(import_dir, 'kallisto_sces.rds')
cellranger_rds <- file.path(import_dir, 'cellranger_sces.rds')
# read in quant info file created in 01-import-quant-data.Rmd
quant_info <- readr::read_tsv(quant_info_file)
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
tool = col_character(),
quant_dir = col_character(),
sample = col_character(),
index_type = col_character(),
kmer = col_character(),
decoy = col_character(),
alevin_alignment = col_character(),
alevin_permit_list = col_character(),
scpca_sample_id = col_character(),
seq_unit = col_character(),
technology = col_character()
)
# make sure that the tool is labeled as alevin_fry_sketch/ unfiltered to distinguish from regular alevin_fry for later
quant_info[which(quant_info$alevin_alignment == "sketch" & quant_info$alevin_permit_list != "unfiltered"),"tool"] <- "alevin_fry_sketch"
quant_info[which(quant_info$alevin_alignment == "salign" & quant_info$alevin_permit_list == "unfiltered"), "tool"] <- "alevin_fry_unfiltered"
quant_info[which(quant_info$alevin_alignment == "sketch" & quant_info$alevin_permit_list == "unfiltered"), "tool"] <- "alevin_fry_unfiltered_sketch"
# save changes to quant file
quant_info_file_update <- file.path(import_dir, 'quant_info_update.tsv')
readr::write_tsv(quant_info, file.path(quant_info_file_update))
# Each RDS file contains a SingleCellExperiment
alevin_sces <- readr::read_rds(alevin_rds)
alevin_fry_sces <- readr::read_rds(alevin_fry_rds)
alevin_fry_sketch_sces <- readr::read_rds(alevin_fry_sketch_rds)
alevin_fry_unfiltered_sces <- readr::read_rds(alevin_fry_unfiltered_rds)
alevin_fry_unfiltered_sketch_sces <- readr::read_rds(alevin_fry_unfiltered_sketch_rds)
kallisto_sces <- readr::read_rds(kallisto_rds)
cellranger_sces <- readr::read_rds(cellranger_rds)
## get all annotation data from S3
annotation_dir <- file.path(import_dir, "annotation")
## get mitochondrial gene list from s3
annotation_files_s3 <-
's3://nextflow-ccdl-data/reference/homo_sapiens/ensembl-100/annotation/'
sync_call <- paste('aws s3 sync', annotation_files_s3, annotation_dir)
system(sync_call, ignore.stdout = TRUE)
mito_file <- file.path(annotation_dir, "Homo_sapiens.ensembl.100.mitogenes.txt")
mito_genes <- readr::read_tsv(mito_file, col_names = "gene_id")
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
gene_id = col_character()
)
mito_genes <- mito_genes %>%
pull(gene_id)
addPerCellQC_mito <- function(sce, mito = mito_genes){
# add per cell QC with mitochondrial genes separately for later comparisons
scater::addPerCellQC(
sce,
subsets = list(mito = mito[mito %in% rownames(sce)])
)
}
alevin_sces <- alevin_sces %>%
purrr::map(addPerCellQC_mito)
alevin_fry_sces <- alevin_fry_sces %>%
purrr::map(addPerCellQC_mito)
alevin_fry_sketch_sces <- alevin_fry_sketch_sces %>%
purrr::map(addPerCellQC_mito)
alevin_fry_unfiltered_sces <- alevin_fry_unfiltered_sces %>%
purrr::map(addPerCellQC_mito)
alevin_fry_unfiltered_sketch_sces <- alevin_fry_unfiltered_sketch_sces %>%
purrr::map(addPerCellQC_mito)
kallisto_sces <- kallisto_sces %>%
purrr::map(addPerCellQC_mito)
cellranger_sces <- cellranger_sces %>%
purrr::map(addPerCellQC_mito)
Loading required package: HDF5Array
Loading required package: DelayedArray
Loading required package: Matrix
Attaching package: ‘Matrix’
The following object is masked from ‘package:S4Vectors’:
expand
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Attaching package: ‘DelayedArray’
The following object is masked from ‘package:purrr’:
simplify
The following objects are masked from ‘package:base’:
aperm, apply, rowsum
Loading required package: rhdf5
## merge all QC output into one data frame to work with for plotting comparisons
celldata_to_df <- function(sce){
# extract the column (cell) summary data from a SCE,
# convert to data frame and move cell id to a column
as.data.frame(colData(sce)) %>%
tibble::rownames_to_column(var = "cell_id") %>%
# add a column to get # of cells detected in that sample
mutate(cells_detected = ncol(sce))
}
alevin_cell_qc <- alevin_sces %>%
purrr::map_df(celldata_to_df, .id = "quant_id")
alevin_fry_cell_qc <- alevin_fry_sces %>%
purrr::map_df(celldata_to_df, .id = "quant_id")
alevin_fry_sketch_cell_qc <- alevin_fry_sketch_sces %>%
purrr::map_df(celldata_to_df, .id = "quant_id")
alevin_fry_unfiltered_cell_qc <- alevin_fry_unfiltered_sces %>%
purrr::map_df(celldata_to_df, .id = "quant_id")
alevin_fry_unfiltered_sketch_cell_qc <- alevin_fry_unfiltered_sketch_sces %>%
purrr::map_df(celldata_to_df, .id = "quant_id")
kallisto_cell_qc <- kallisto_sces %>%
purrr::map_df(celldata_to_df, .id = "quant_id")
cellranger_cell_qc <- cellranger_sces %>%
purrr::map_df(celldata_to_df, .id = "quant_id")
# combine all the data frames into one
cell_qc <- dplyr::bind_rows(
alevin = alevin_cell_qc,
alevin_fry = alevin_fry_cell_qc,
alevin_fry_sketch = alevin_fry_sketch_cell_qc,
alevin_fry_unfiltered = alevin_fry_unfiltered_cell_qc,
alevin_fry_unfiltered_sketch = alevin_fry_unfiltered_sketch_cell_qc,
kallisto = kallisto_cell_qc,
cellranger = cellranger_cell_qc,
.id = "tool"
) %>%
dplyr::left_join(quant_info,
by = c("tool" = "tool",
"quant_id" = "quant_dir")) %>%
# remove extraneous run from 119 with txome index since we have the run with the spliced index
dplyr::filter(quant_id != "SCPCR000119-txome_k31") %>%
# remove duplicate runs of alevin-fry --knee on 118, 119
dplyr::filter(quant_id !="SCPCR000118-spliced_intron_txome_k31-salign") %>%
dplyr::filter(quant_id != "SCPCR000119-spliced_intron_txome_k31-salign")
Let’s first start by taking a look at some general information across all of our tools and how consistent each of these metrics is for all four of the samples. Here are the metrics we want to look at:
## make new data frame with mean of cells detected and standard dev
cells_detected_stats <- cell_qc %>%
group_by(sample, tool) %>%
summarise(cells_detected = mean(cells_detected))
`summarise()` has grouped output by 'sample'. You can override using the `.groups` argument.
cells_detected_stats
## first look at cells detected
ggplot(cells_detected_stats, aes(x = tool, y = cells_detected)) +
geom_jitter(mapping = aes(color = sample)) +
geom_violin() +
theme_classic() +
ylab("Total Cells Detected") +
xlab("") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Based on this, it looks like alevin-fry using the --knee-distance does not handle single-nuclei samples well. In contrast, the alevin-fry --unfiltered-pl and kallisto samples were filtered using DropletUtils::emptyDrops and they have much lower numbers for the same single-nuclei samples.
Let’s take a look at the mitochondrial content per cell across the tools and see if we see variation there.
ggplot(cell_qc, aes(x = tool, y = subsets_mito_percent, fill = tool)) +
geom_boxplot() +
facet_grid(sample ~ index_type) +
theme_classic() +
ylab("% Mito /Cell") +
theme(axis.ticks.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))
It looks like SCPCR000003 is a very poor sample across all tools and has a lot of dead cells. It might be a good idea to remove that from our comparisons and not consider this for benchmarking. Additionally, it looks like we can see that SCPCR000118 (snRNAseq) also has some variation, but the other samples tend to be fairly consistent.
# remove SCPCR000003 from cell_qc for the rest of the plotting
cell_qc_filter <- cell_qc %>%
dplyr::filter(sample != "SCPCR000003")
Now let’s look at the number of UMI’s detected/cell that we’ve removed our outlier sample.
# number of UMI's/sample
ggplot(cell_qc_filter, aes(x = tool, y = sum, fill = tool)) +
geom_boxplot() +
facet_grid(sample ~ index_type) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylab("UMI/cell") +
xlab("")
At first glance, SCPCR000118 and SCPCR000119 (the single nucleus RNA-seq samples) have very low values of UMI/cell. Is this because these samples are poor quality? Or is it a problem with the index? However, the samples don’t look any better with cellranger so it could be a sample issue or a common feature of snRNA-seq samples.
Let’s separate our plots to look at single cell vs. single nucleus RNA-seq.
# first look at single nucleus samples only
cell_qc_filter %>%
filter(seq_unit == "nucleus") %>%
ggplot(aes(x = tool, y = sum, fill = tool)) +
geom_boxplot() +
facet_grid(sample ~ index_type) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylab("UMI/cell") +
xlab("") +
coord_cartesian(ylim = c(0,10000))
In general, it looks like SCPCR000118 has lower coverage than 119. Cellranger, Kallisto, and alevin-fry-unfiltered-sketch (with the pre mRNA index) actually seem to be have similar UMIs/cell in both samples, looking at the pre-mRNA index.
# single cell samples only
cell_qc_filter %>%
filter(seq_unit == "cell") %>%
ggplot(aes(x = tool, y = sum, fill = tool)) +
geom_boxplot() +
facet_grid(sample ~ index_type) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylab("UMI/cell") +
xlab("")
In general, these all look quite similar to each other. Alevin seems to be the outlier here and varies across samples. Kallisto is also on the slightly higher end than cellranger.
I think keeping the single cell and single nuclei samples separate is helpful for visualization so I’m going to keep them separate from here on out.
Now let’s look at genes/cell?
cell_qc_filter %>%
filter(seq_unit == "nucleus") %>%
ggplot(aes(x = tool, y = detected, fill = tool)) +
geom_boxplot() +
facet_grid(sample ~ index_type) +
theme_classic() +
ylab("Genes Detected/Cell") +
theme(axis.ticks.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) +
ylim(c(0,6000))
Here, we see a bit more variation in genes/cell for the single nucleus samples than we did with the UMI/cell. It looks like kallisto and alevin-fry --unfiltered-pl with the --sketch mode looks comparable to cellranger. Alevin-fry-unfiltered-pl is on the higher end, and is consistently capturing more genes (not necessarily a good thing?). Alevin is showing a lot of variability in 119, but not 118.
# single cell samples only
cell_qc_filter %>%
filter(seq_unit == "cell") %>%
ggplot(aes(x = tool, y = detected, fill = tool)) +
geom_boxplot() +
facet_grid(sample ~ index_type) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylab("Genes/cell") +
xlab("")
Here we are seeing similar trends in genes/cell as we did with UMI/cell for the single cell samples. We do see that alevin tends to report high genes/cell in all but 1 of the scRNA-seq samples. That same sample was the one that alevin also showed high mito content for, so perhaps that is why 006 is not as high for alevin. Cellranger is on the low end for all single cell samples, with alevin-fry --unfiltered-pl being mostly comparable.
sessionInfo()
R version 4.0.5 (2021-03-31)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] HDF5Array_1.18.1 rhdf5_2.34.0 DelayedArray_0.16.3 Matrix_1.3-3 ggforce_0.3.3 SingleCellExperiment_1.12.0
[7] SummarizedExperiment_1.20.0 Biobase_2.50.0 GenomicRanges_1.42.0 GenomeInfoDb_1.26.7 IRanges_2.24.1 S4Vectors_0.28.1
[13] BiocGenerics_0.36.1 MatrixGenerics_1.2.1 matrixStats_0.58.0 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.6
[19] purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 ggplot2_3.3.3 tidyverse_1.3.1
loaded via a namespace (and not attached):
[1] bitops_1.0-7 fs_1.5.0 lubridate_1.7.10 httr_1.4.2 tools_4.0.5 backports_1.2.1
[7] utf8_1.2.1 R6_2.5.0 irlba_2.3.3 vipor_0.4.5 DBI_1.1.1 colorspace_2.0-1
[13] rhdf5filters_1.2.1 withr_2.4.2 tidyselect_1.1.1 gridExtra_2.3 compiler_4.0.5 cli_2.5.0
[19] rvest_1.0.0 BiocNeighbors_1.8.2 xml2_1.3.2 labeling_0.4.2 scales_1.1.1 digest_0.6.27
[25] rmarkdown_2.8 XVector_0.30.0 scater_1.18.6 pkgconfig_2.0.3 htmltools_0.5.1.1 sparseMatrixStats_1.2.1
[31] dbplyr_2.1.1 rlang_0.4.11 readxl_1.3.1 rstudioapi_0.13 DelayedMatrixStats_1.12.3 farver_2.1.0
[37] generics_0.1.0 jsonlite_1.7.2 BiocParallel_1.24.1 RCurl_1.98-1.3 magrittr_2.0.1 BiocSingular_1.6.0
[43] GenomeInfoDbData_1.2.4 scuttle_1.0.4 Rhdf5lib_1.12.1 Rcpp_1.0.6 ggbeeswarm_0.6.0 munsell_0.5.0
[49] fansi_0.4.2 viridis_0.6.1 lifecycle_1.0.0 stringi_1.6.2 yaml_2.2.1 MASS_7.3-54
[55] zlibbioc_1.36.0 grid_4.0.5 crayon_1.4.1 lattice_0.20-44 haven_2.4.1 beachmat_2.6.4
[61] hms_1.1.0 knitr_1.33 pillar_1.6.1 reprex_2.0.0 glue_1.4.2 evaluate_0.14
[67] BiocManager_1.30.15 modelr_0.1.8 tweenr_1.0.2 vctrs_0.3.8 cellranger_1.1.0 polyclip_1.10-0
[73] gtable_0.3.0 assertthat_0.2.1 xfun_0.23 rsvd_1.0.5 broom_0.7.6 viridisLite_0.4.0
[79] tinytex_0.31 beeswarm_0.3.1 ellipsis_0.3.2